################################################
# Analysis for:
#
# Jordan, Soren. 2019. "Leadership PAC Donations and Party Status: A Technical 
#  and Theoretical Extension." Research & Politics 6 (4): 1-9. 
#  DOI: 10.1177/2053168019889558
#
# Required files: LPACsData.csv
#                 LPACsAnalysis.rdata
#
################################################

library(pscl)
library(lme4)
library(ggplot2)
library(MASS)
library(gridExtra)
library(sjPlot)
library(ATE)
library(Matching)
library(ggpubr)
library(boot)
library(reshape)

data.csv <- read.csv("LPACsData.csv")
load("LPACsAnalysis.rdata") # For theta

################################################################################
################################################################################
############################ Section 1. Replication ############################
################################################################################
################################################################################

#############################################
# Table 1. Strict ABLR Replication
#############################################

# Authors center some of the variables. This is literally their code from their replication file
data.recode <- data.csv
data.recode$incumbent[data.recode$incumbent == 0] <- -1.23635189467216
data.recode$incumbent[data.recode$incumbent == 1] <- 0.808456751952548
data.recode$rating.num3[data.recode$rating.num3 == 1] <- -2.39663395566064
data.recode$rating.num3[data.recode$rating.num3 == 2] <- -0.929722835165604
data.recode$rating.num3[data.recode$rating.num3 == 3] <- 0.537188285329432
data.recode$abs.cfdist <- (data.recode$abs.cfdist - mean(data.recode$abs.cfdist)) * 1/sd(data.recode$abs.cfdist)
data.recode$margin <- (data.recode$margin - mean(data.recode$margin)) * 1/sd(data.recode$margin)

model.dist.re <- glmer(amount.sum.x ~ (abs.cfdist | minority) + rating.num3 + exp.gain.seats +  
    incumbent + party + margin, data = data.recode, family=negative.binomial(theta))

summary(model.dist.re)

model.rat.re <- glmer(amount.sum.x ~ (rating.num3 | minority) + abs.cfdist + exp.gain.seats +  
    incumbent + party + margin, data = data.recode, family=negative.binomial(theta))

summary(model.rat.re)

# These are identical to fixed effects. The random effects don't add anything. There are only two groups
#  so when we draw "random" intercepts we're not really shifting anything. 
#  This IS doing something with the group-level variances, though. 
#  See Gelman and Hill, 275. There isn't too much to do when the number of groups is
#  small, as we can't get good estimates of the variance. 

dist.names <- c(colnames(coef(model.dist.re)$minority)[3], colnames(coef(model.rat.re)$minority)[3], 
	colnames(coef(model.dist.re)$minority)[4:7], 
	colnames(coef(model.dist.re)$minority)[2], colnames(coef(model.dist.re)$minority)[1], 
	colnames(coef(model.dist.re)$minority)[1], 
	colnames(coef(model.rat.re)$minority)[1], 
	colnames(coef(model.rat.re)$minority)[1])

dist.coef <- c(coef(model.dist.re)$minority[1,3], 0, 
	coef(model.dist.re)$minority[1,4:7], 
	13.54421, coef(model.dist.re)$minority[1,1], 
	coef(model.dist.re)$minority[2,1], 0, 0) # Change intercept to global 

dist.coef.round <- unlist(lapply(dist.coef, function(num) {round(num, digits = 3)}))

dist.se <- c(round(coef(summary(model.dist.re))[2,2], digits = 3), " ",
	round(coef(summary(model.dist.re))[3:6,2], digits = 3), 
	round(coef(summary(model.dist.re))[1,2], digits = 3),
	0.001, 0.001, " ", " ")

rat.coef <- c(0, coef(model.rat.re)$minority[1,3], 
	coef(model.rat.re)$minority[1,4:7], 
	10.075, 0, 0, 
	coef(model.rat.re)$minority[1,1], 
	coef(model.rat.re)$minority[2,1]) # Change intercept to global 

rat.coef.round <- unlist(lapply(rat.coef, function(num) {round(num, digits = 3)}))

rat.se <- c(" ", round(coef(summary(model.rat.re))[2:6,2], digits = 3),
	round(coef(summary(model.rat.re))[1,2], digits = 3),
	" ", " ", 0.008, 0.008)

for(i in 1:length(rat.coef.round)) {
	print(paste(dist.names[i], dist.coef.round[i], rat.coef.round[i], "\\", sep = " & "))
	print(paste(" ", paste("(", dist.se[i], ")", sep = ""), paste("(", rat.se[i], ")", sep = ""), "\\", sep = " & "))
}


length(fitted(model.dist.re))
length(fitted(model.rat.re))

AIC(model.dist.re)
AIC(model.rat.re)



################################################################################
################################################################################
######################### Section 2. Original Analysis #########################
################################################################################
################################################################################

# Two obscene cases
data.csv$amount.sum.x[data.csv$amount.sum.x > 120000]

data.csv$minority.01 <- as.numeric(data.csv$minority) - 1
data.csv$exp.gain.01 <- as.numeric(data.csv$exp.gain.seats) - 1
data.csv$repub <- as.numeric(data.csv$party) - 1

summary(data.csv$amount.sum.x)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#      0       0    5000   17010   25000 2166000 
summary(data.csv$abs.cfdist)
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#0.00000 0.09735 0.21320 0.27560 0.39030 3.40700
summary(data.csv$minority.01)
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.0000  0.0000  0.0000  0.4866  1.0000  1.0000 
summary(data.csv$exp.gain.01)
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.0000  0.0000  1.0000  0.6102  1.0000  1.0000 
summary(data.csv$incumbent)
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.0000  0.0000  1.0000  0.6046  1.0000  1.0000 
summary(data.csv$repub)
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
# 0.0000  0.0000  1.0000  0.5796  1.0000  1.0000 
summary(data.csv$margin)
#  Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  14.00   14.00   19.00   24.21   40.00   40.00 
summary(data.csv$rating.num3)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  1.000   2.750   3.000   2.634   3.000   3.000 

summary(subset(data.csv$amount.sum.x, data.csv$amount.sum.x > 0 & data.csv$minority.01 ==1))





#############################################
# Figure 1. Descriptives
#############################################

# Exclude the two multimillion dollar cases
data.reduc <- data.frame(subset(data.csv$amount.sum.x, data.csv$amount.sum.x < 200000))

hist1 <- ggplot(data = data.reduc, aes(subset.data.csv.amount.sum.x..data.csv.amount.sum.x...2e.05.)) +
	geom_histogram() +
	labs(x = "LPAC Donation Amount", y = "Count", caption = "Excludes two extreme values. Details in text.") +
	theme_bw()

hist2 <- ggplot(data = data.csv, aes(abs.cfdist)) +
	geom_histogram() +
	labs(x = "Ideological Distance", y = "Count") +
	theme_bw()

grid.arrange(hist1, hist2, ncol = 2)
dev.off()


maj.donations <- data.csv$amount.sum.x[data.csv$minority.01 == 0 & data.csv$abs.cfdist > mean(data.csv$abs.cfdist) & data.csv$amount.sum.x < 200000]
length(maj.donations)
length(maj.donations[maj.donations > 0])/length(maj.donations)
median(maj.donations[maj.donations > 0])


#############################################
# Table 2. Triple Interaction + Zero-Inflation
#############################################

triple.inc <- zeroinfl(amount.sum.x ~ incumbent*minority.01*abs.cfdist + exp.gain.01 + rating.num3 + repub + margin, data = data.csv, dist = "negbin")
summary(triple.inc)

# Build a table
meow <- summary(triple.inc)

for(i in 1:length(meow$coefficients$count[,1])) {
	print(paste(rownames(meow$coefficients$count)[i], round(meow$coefficients$count[i,1], digits = 3), 
		paste("(", round(meow$coefficients$count[i,2], digits = 3), ")", sep = ""), ifelse(abs(meow$coefficients$count[i, 3]) > 2, "$^{*}$ \\", "\\"), sep = " & "))
}

for(i in 1:length(meow$coefficients$zero[,1])) {
	print(paste(rownames(meow$coefficients$zero)[i], round(meow$coefficients$zero[i,1], digits = 3), 
		paste("(", round(meow$coefficients$zero[i,2], digits = 3), ")", sep = ""), ifelse(abs(meow$coefficients$zero[i, 3]) > 2, "$^{*}$ \\", "\\"), sep = " & "))
}

AIC(triple.inc)
length(fitted(triple.inc))


## Generate interpretations
# Over the set of distance from 0 to 1.5
# We will consider both minority and majority parties
# We will consider both incumbents and not
# We will consider ratings 1 and 3
# Republican held constant
# Marginality held constant at 24.21 (mean)

cf.dist <- rep(seq(0, 1.5, 0.1), 8)
minor <- c(rep(0, length(cf.dist)/4), rep(1, length(cf.dist)/4))
incumb <- c(rep(0, length(cf.dist)/8), rep(1, length(cf.dist)/8))
rating.num3 <- c(rep(1, length(cf.dist)/2), rep(3, length(cf.dist)/2))
exp.gain <- 1
rep <- 1
marg <- 24.21
cf.minor <- cf.dist*minor
cf.inc <- cf.dist*incumb
minor.inc <- minor*incumb
cf.min.inc <- cf.dist*minor*incumb

X.triple <- cbind(1, incumb, minor, cf.dist, exp.gain, rating.num3, rep, marg, minor.inc, cf.inc, cf.minor, cf.min.inc)

names(triple.inc$coefficients$zero)
names(triple.inc$coefficients$count)

Xb.triple.zero <- X.triple%*%triple.inc$coefficients$zero
Xb.triple.count <- X.triple%*%triple.inc$coefficients$count
theta.triple <- triple.inc$theta
alpha.triple <- 1/theta.triple

# Zero equation = logit of Xb. Predicting certain zeroes
p.triple <- 1/(1 + exp(-Xb.triple.zero))

# Count equation: predicting expected donation
exp.counts.triple <- exp(Xb.triple.count)*(1-p.triple)

gg.full <- data.frame(round(exp.counts.triple), p.triple, minor, cf.dist, incumb, rating.num3)
gg.full$minor[gg.full$minor == 0] <- "Majority"
gg.full$minor[gg.full$minor == 1] <- "Minority"
gg.full$incumb[gg.full$incumb == 0] <- "Challenger"
gg.full$incumb[gg.full$incumb == 1] <- "Incumbent"
gg.full$rating.num3[gg.full$rating.num3 == 1] <- "Toss Up"
gg.full$rating.num3[gg.full$rating.num3 == 3] <- "Safe"


# Generate errors with bootstrapping
# Bootstrap the rating == 1 races first
boot.triple <- function(formula, data, indices) {
	data.sub <- data[indices,]
	model.sub <- zeroinfl(formula, data = data.sub, dist = "negbin")
	Xb.zero <- X.triple%*%model.sub$coefficients$zero
	Xb.count <- X.triple%*%model.sub$coefficients$count
	logit.zero <- 1/(1 + exp(-Xb.zero))
	count.zero <- exp(Xb.count)*(1-logit.zero)
	return(c(logit.zero, count.zero))	
}	

set.seed(06062019) # Today's date (first analysis)
results.triple <- boot(data = data.csv, statistic = boot.triple, R = 10000,
	formula = amount.sum.x ~ incumbent*minority.01*abs.cfdist + exp.gain.01 + rating.num3 + repub + margin)

# https://www.tau.ac.il/~saharon/Boot/10.1.1.133.8405.pdf use the percentile intervals
zero.l90 <- zero.u90 <- count.l90  <- count.u90 <- rep(NA, length(X.triple[,1]))

for(i in 1:length(zero.l90)) {
	zero.l90[i] <- boot.ci(results.triple, index = i, conf = 0.90, type = "perc")$percent[4] # Lower
	zero.u90[i] <- boot.ci(results.triple, index = i, conf = 0.90, type = "perc")$percent[5] # Upper
	count.l90[i] <- boot.ci(results.triple, index = (i + length(zero.l90)), conf = 0.90, type = "perc")$percent[4] # Lower
	count.u90[i] <- boot.ci(results.triple, index = (i + length(zero.l90)), conf = 0.90, type = "perc")$percent[5] # Upper
}

# ggplot wants to map the error bar aesthetic relative to y. so it's easier if we pass them as a difference
gg.full$p.triple.lower.90 <- abs(p.triple - zero.l90)
gg.full$p.triple.upper.90 <- abs(zero.u90 - p.triple)
gg.full$count.lower.90 <- abs(exp.counts.triple - count.l90)
gg.full$count.upper.90 <- abs(count.u90 - exp.counts.triple)


#############################################
# Figures 2 and 3. Zero Equation and Count Equation
#############################################

ggplot(data = gg.full, aes(cf.dist, p.triple, color = factor(incumb), lty = factor(rating.num3))) + 
	geom_errorbar(aes(ymin = p.triple - p.triple.lower.90, 
		ymax = p.triple + p.triple.upper.90), width = 0.1) + 	
	geom_line(size = 1.5) + 
	scale_colour_grey(start = 0.6, end = 0) +
	facet_wrap(~minor) +
	labs(lty = "House Rating", color = "Incumbent", x = "Ideological Distance", y = "Pr(Never Donation)") +
	theme_bw()
dev.off()

ggplot(data = gg.full, aes(cf.dist, round.exp.counts.triple., color = factor(incumb), lty = factor(rating.num3))) + 
	geom_errorbar(aes(ymin = round.exp.counts.triple. - count.lower.90, 
		ymax = round.exp.counts.triple. + count.upper.90), width = 0.1) + 	
	geom_line(size = 1.5) + 
	scale_colour_grey(start = 0.6, end = 0) +
	facet_wrap(~minor) +
	labs(lty = "House Rating", color = "Incumbent", x = "Ideological Distance", y = "Expected Donation") +
	theme_bw()
dev.off()



# Explore some more. First, fit the value for just the zero equation for all observations

X.for.zeroes <- cbind(1, data.csv$incumbent, data.csv$minority.01, data.csv$abs.cfdist, 	
	data.csv$exp.gain.01, data.csv$rating.num3, data.csv$repub, data.csv$margin, 
	data.csv$incumbent*data.csv$minority.01, data.csv$incumbent*data.csv$abs.cfdist, 
	data.csv$minority.01*data.csv$abs.cfdist, data.csv$incumbent*data.csv$abs.cfdist*data.csv$minority.01)

X.never.donate <- X.for.zeroes%*%triple.inc$coefficients$zero
p.zero <- 1/(1 + exp(-X.never.donate))

# A chunk of observations <very> likely to receive donations, and a lot very unlikely (pr > 0.50)

# It really is incumbency and competitiveness that are sorting the zero races from the others. 

not.safe <- c(data.csv$amount.sum.x[data.csv$incumbent == 0 & data.csv$rating.num3 == 1], 
	data.csv$amount.sum.x[data.csv$incumbent == 1 &	data.csv$rating.num3 == 1],
	data.csv$amount.sum.x[data.csv$incumbent == 0 & data.csv$rating.num3 == 2],
	data.csv$amount.sum.x[data.csv$incumbent == 1 & data.csv$rating.num3 == 2])

safe <- c(data.csv$amount.sum.x[data.csv$incumbent == 0 & data.csv$rating.num3 == 3],
	data.csv$amount.sum.x[data.csv$incumbent == 1 & data.csv$rating.num3 == 3])


table(not.safe)[1]/length(not.safe) # 6 percent of competitive races are zero
table(safe)[1]/length(safe) # 57 percent of safe races are

# So this unilaterally moves you out of the zero state

# 44 percent of the sample is zero
table(data.csv$amount.sum.x)[1]/length(data.csv$amount.sum.x)


# Let's make a plot to look at the triple interaction
# First, make two versions of distance that are factors, so we can facet them on ggplot
#  Notice, the numbers we're assigning do not matter, since ggplot will treat them as factors
cfdist.chunks <- cfdist.binary <- cfdist.quarts.ext <- data.csv$abs.cfdist
cfdist.chunks[cfdist.chunks < 0.3] <- 9
cfdist.chunks[cfdist.chunks < 0.6] <- 10
cfdist.chunks[cfdist.chunks < 0.9] <- 11
cfdist.chunks[cfdist.chunks < 1.2] <- 12
cfdist.chunks[cfdist.chunks < 9] <- 13
cfdist.binary[cfdist.binary < mean(data.csv$abs.cfdist)] <- 0
cfdist.binary[cfdist.binary > 0] <- 1
cfdist.quarts.ext[cfdist.quarts.ext < quantile(data.csv$abs.cfdist, 0.25)] <- -2
cfdist.quarts.ext[cfdist.quarts.ext > quantile(data.csv$abs.cfdist, 0.75)] <- -1
cfdist.quarts.ext[cfdist.quarts.ext > 0] <- NA

# Plot data
data.ggplot <- data.frame(data.csv$amount.sum.x, data.csv$incumbent, data.csv$minority.01, p.zero, data.csv$abs.cfdist, 
	fitted(model.dist.re), cfdist.chunks, cfdist.binary, cfdist.quarts.ext, data.csv$rating.num3)
names(data.ggplot) <- c("Amount", "Incumbent", "Minority", "Pr.Zero", "CfDist", "Fitted", "Factor.CfDist",
	"Binary.CfDist", "Quartile.CfDist", "Rating")

# Sub-plot data
data.ggplot.minority <- subset(data.ggplot, data.ggplot$Minority == 1)
data.ggplot.majority <- subset(data.ggplot, data.ggplot$Minority == 0)

# Minority party, ABLR model fitted values

#############################################
# Figure 4. Comparison of predictions
#############################################
median(fitted(triple.inc)[data.csv$amount.sum.x > 0 & data.csv$minority.01 == 1])
median(data.csv$amount.sum.x[data.csv$amount.sum.x > 0 & data.csv$minority.01 == 1])
mean(fitted(triple.inc)[data.csv$amount.sum.x > 0 & data.csv$minority.01 == 1])
mean(data.csv$amount.sum.x[data.csv$amount.sum.x > 0 & data.csv$minority.01 == 1])


median(fitted(triple.inc)[data.csv$amount.sum.x > 0 & data.csv$minority.01 == 0])
median(data.csv$amount.sum.x[data.csv$amount.sum.x > 0 & data.csv$minority.01 == 0])
mean(fitted(triple.inc)[data.csv$amount.sum.x > 0 & data.csv$minority.01 == 0])
mean(data.csv$amount.sum.x[data.csv$amount.sum.x > 0 & data.csv$minority.01 == 0])


length(data.csv$amount.sum.x[data.csv$amount.sum.x > 125000]) # Omits two real cases
length(fitted(model.dist.re)[fitted(model.dist.re) > 125000]) 
# Omits 25 higher predictions (that would drastically rescale the y-axis)
#  These are wrong, anyway, so we're not shorting ABLR by not plotting them
cbind(fitted(model.dist.re)[fitted(model.dist.re) > 125000], data.csv$amount.sum.x[fitted(model.dist.re) > 125000])


theirs.min <- ggplot(data = na.omit(data.ggplot.minority), aes(Fitted, Amount, color = factor(Incumbent), 
		shape = factor(Quartile.CfDist))) +
	geom_rect(xmin = -2000, xmax = 31000, ymin = -2000, ymax = 2000, alpha = 0) +
	geom_point(size = 1) +
	scale_colour_grey(name = "Incumbent Status", labels = c("Challenger", "Incumbent"), start = 0.6, end = 0) +
	scale_shape_discrete(name  ="Distance", labels=c("Most Similar", "Least Similar")) +
	xlim(0, 125000) +
	ylim(0, 125000) +
	geom_abline(slope = 1) +
	labs(x = "ABLR Expected Donation", y = "Actual Donation (Minority Party)") +
	theme_bw() +
	theme(legend.key = element_rect(colour = "black"))

theirs.maj <- ggplot(data = na.omit(data.ggplot.majority), aes(Fitted, Amount, color = factor(Incumbent), 
		shape = factor(Quartile.CfDist))) +
	geom_rect(xmin = -2000, xmax = 80000, ymin = -2000, ymax = 2000, alpha = 0) +
	geom_point(size = 1) +
	scale_colour_grey(name = "Incumbent Status", labels = c("Challenger", "Incumbent"), start = 0.6, end = 0) +
	scale_shape_discrete(name  ="Distance", labels=c("Most Similar", "Least Similar")) +
	xlim(0, 125000) +
	ylim(0, 125000) +
	geom_abline(slope = 1) +
	labs(x = "ABLR Expected Donation", y = "Actual Donation (Majority Party)") +
	theme_bw() +
	theme(legend.key = element_rect(colour = "black"))

mine.min <- ggplot(data = na.omit(data.ggplot.minority), aes(Pr.Zero, Amount, color = factor(Incumbent), 
		shape = factor(Quartile.CfDist))) +
	geom_rect(xmin = 0.475, xmax = 1.025, ymin = -2000, ymax = 2000, alpha = 0) +
	geom_point(size = 1) +
	scale_colour_grey(name = "Incumbent Status", labels = c("Challenger", "Incumbent"), start = 0.6, end = 0) +
	scale_shape_discrete(name  ="Distance", labels=c("Most Similar", "Least Similar")) +
	xlim(0, 1) +
	ylim(0, 125000) +
	labs(x = "Pr(Never Donation)", y = "Actual Donation (Minority Party)") +
	theme_bw() +
	theme(legend.key = element_rect(colour = "black"))

mine.maj <- ggplot(data = na.omit(data.ggplot.majority), aes(Pr.Zero, Amount, color = factor(Incumbent), 
		shape = factor(Quartile.CfDist))) +
	geom_rect(xmin = 0.475, xmax = 1.025, ymin = -2000, ymax = 2000, alpha = 0) +
	geom_point(size = 1) +
	scale_colour_grey(name = "Incumbent Status", labels = c("Challenger", "Incumbent"), start = 0.6, end = 0) +
	scale_shape_discrete(name  ="Distance", labels=c("Most Similar", "Least Similar")) +
	xlim(0, 1) +
	ylim(0, 125000) +
	labs(x = "Pr(Never Donation)", y = "Actual Donation (Majority Party)") +
	theme_bw() +
	theme(legend.key = element_rect(colour = "black"))

ggarrange(theirs.min, mine.min, theirs.maj, mine.maj, ncol = 2, nrow = 2, 
	common.legend = TRUE, legend = "bottom")
dev.off()


# Some final quantities to discuss

# How much better do we do?
# Minority
# ABLR
length(data.ggplot.minority$Amount[data.ggplot.minority$Amount == 0])/length(data.ggplot.minority$Amount) # 44% are 0
length(data.ggplot.minority$Fitted[data.ggplot.minority$Amount == 0 & data.ggplot.minority$Fitted > 2000])/length(data.ggplot.minority$Amount[data.ggplot.minority$Amount == 0]) # 91% of are predicted to be over $2000

# Zero
length(data.ggplot.minority$Amount[data.ggplot.minority$Amount == 0])/length(data.ggplot.minority$Amount) # 44% are 0
length(data.ggplot.minority$Fitted[data.ggplot.minority$Amount == 0 & data.ggplot.minority$Pr.Zero > 0.50])/length(data.ggplot.minority$Amount[data.ggplot.minority$Amount == 0]) # 78% of are predicted not to get a donation

# Theory
length(data.ggplot.minority$Amount[data.ggplot.minority$Amount == 0 & data.ggplot.minority$Rating == 3])/
length(data.ggplot.minority$Amount[data.ggplot.minority$Amount == 0])	# 97 %of minority 0s are safe

length(data.ggplot.minority$Amount[data.ggplot.minority$Amount == 0 & data.ggplot.minority$Rating == 1])/
length(data.ggplot.minority$Amount[data.ggplot.minority$Amount == 0])	# 0% of minority 0s are toss-up



# Majority
# ABLR
length(data.ggplot.majority$Amount[data.ggplot.majority$Amount == 0])/length(data.ggplot.majority$Amount) # 44% are 0
length(data.ggplot.majority$Fitted[data.ggplot.majority$Amount == 0 & data.ggplot.majority$Fitted > 2000])/length(data.ggplot.majority$Amount[data.ggplot.majority$Amount == 0]) # 99% of are predicted to be over $2000

# Zero
length(data.ggplot.majority$Amount[data.ggplot.majority$Amount == 0])/length(data.ggplot.majority$Amount) # 44% are 0
length(data.ggplot.majority$Fitted[data.ggplot.majority$Amount == 0 & data.ggplot.majority$Pr.Zero > 0.50])/length(data.ggplot.majority$Amount[data.ggplot.majority$Amount == 0]) # 57% of are predicted not to get a donation

# Theory
length(data.ggplot.majority$Amount[data.ggplot.majority$Amount == 0 & data.ggplot.majority$Rating == 3])/
length(data.ggplot.majority$Amount[data.ggplot.majority$Amount == 0])	# 95 %of majority 0s are safe

length(data.ggplot.majority$Amount[data.ggplot.majority$Amount == 0 & data.ggplot.majority$Rating == 1])/
length(data.ggplot.majority$Amount[data.ggplot.majority$Amount == 0])	# 2 %of majority 0s are toss-up










#############################################
# Appendix. Model comparisons
#############################################
# Main model. 
triple.inc <- zeroinfl(amount.sum.x ~ incumbent*minority.01*abs.cfdist + exp.gain.01 + rating.num3 + repub + margin, data = data.csv, dist = "negbin")
summary(triple.inc)
AIC(triple.inc)
length(fitted(triple.inc))

# From THEIR appendix: two <separate> hurdle models, like in the main text (for cfdist and house rating separately)
#  This is straight from the ABLR replication file for the JOP appendix
#  We're also going to use their dataset (from the loaded .rdata) for complete clarity
hurdle.abs.cfdist <- hurdle(formula = amount.sum.x ~ abs.cfdist + minority + exp.gain.seats + incumbent + party + margin, 
	data = hc06.12.general, dist = "negbin")                   
summary(hurdle.abs.cfdist)
AIC(hurdle.abs.cfdist)
length(fitted(hurdle.abs.cfdist))

hurdle.rating <-  hurdle(formula = amount.sum.x ~ rating.num3 + minority  + exp.gain.seats + incumbent + party + margin, 
	data = hc06.12.general, dist = "negbin")
summary(hurdle.rating) 
AIC(hurdle.rating)
length(fitted(hurdle.rating))


# Extension 1. Zero-inflation for their specification
zeroinfl.abs.cfdist <- zeroinfl(formula = amount.sum.x ~ abs.cfdist + minority + exp.gain.seats + incumbent + party + margin, 
	data = hc06.12.general, dist = "negbin")                   
summary(zeroinfl.abs.cfdist)
AIC(zeroinfl.abs.cfdist)
length(fitted(zeroinfl.abs.cfdist))

zeroinfl.rating <- zeroinfl(formula = amount.sum.x ~ rating.num3 + minority  + exp.gain.seats + incumbent + party + margin, 
	data = hc06.12.general, dist = "negbin")
summary(zeroinfl.rating) 
AIC(zeroinfl.rating)
length(fitted(zeroinfl.rating))

# These two (hurdle and zeroinfl) are nearly identical when the specifications are separate. Of course, theory suggests
#  that isn't a good idea!


# Extension 2. Negtive binomial (no zero inflation) with the interactive specification
triple.nb <- glm.nb(amount.sum.x ~ incumbent*minority.01*abs.cfdist + exp.gain.01 + rating.num3 + repub + margin, data = data.csv)
summary(triple.nb)
AIC(triple.nb)
length(fitted(triple.nb))

# Extension 3. Hurdle with the interactive specification
triple.hurdle <- hurdle(amount.sum.x ~ incumbent*minority.01*abs.cfdist + exp.gain.01 + rating.num3 + repub + margin, data = data.csv, dist = "negbin") 
summary(triple.hurdle)
AIC(triple.hurdle)
length(fitted(triple.hurdle))


# Save the fitted predictions from the unconditional hurdle models (ABLR original) and the triple interation (hurdle and zero-inflated)
triple.zero.preds <- predict(triple.inc)
triple.hurdle.preds <- predict(triple.hurdle)
hurdle.rat.preds <- predict(hurdle.rating)
hurdle.dist.preds <- predict(hurdle.abs.cfdist)


#############################################
# Appendix Table A1. Model comparisons
#############################################
# Use the following to build a separate table in Excel: there's just too many different coefficients with too many different
#  names to combine here

# Model results: triple zero-inflated model
t.z.count <- (melt(round(coef(summary(triple.inc))$count[,1:2], digits = 3)))
t.z.zero <- (melt(round(coef(summary(triple.inc))$zero[,1:2], digits = 3)))
rbind(t.z.count[order(t.z.count$X1),], t.z.zero[order(t.z.zero$X1),])[,c(1, 3)]
AIC(triple.inc)
length(fitted(triple.inc))

# Model results: their appendix model of hurdle with distance
hd.app.count <- (melt(round(coef(summary(hurdle.abs.cfdist))$count[,1:2], digits = 3)))
hd.app.zero <- (melt(round(coef(summary(hurdle.abs.cfdist))$zero[,1:2], digits = 3)))
rbind(hd.app.count[order(hd.app.count$X1),], hd.app.zero[order(hd.app.zero$X1),])[,c(1, 3)]
AIC(hurdle.abs.cfdist)
length(fitted(hurdle.abs.cfdist))

# Model results: their appendix model of hurdle with rating
hr.app.count <- (melt(round(coef(summary(hurdle.rating))$count[,1:2], digits = 3)))
hr.app.zero <- (melt(round(coef(summary(hurdle.rating))$zero[,1:2], digits = 3)))
rbind(hr.app.count[order(hr.app.count$X1),], hr.app.zero[order(hr.app.zero$X1),])[,c(1, 3)]
AIC(hurdle.rating)
length(fitted(hurdle.rating))

# Model results: extension of their model of zero-inflation with distance
zid.app.count <- (melt(round(coef(summary(zeroinfl.abs.cfdist))$count[,1:2], digits = 3)))
zid.app.zero <- (melt(round(coef(summary(zeroinfl.abs.cfdist))$zero[,1:2], digits = 3)))
rbind(zid.app.count[order(zid.app.count$X1),], zid.app.zero[order(zid.app.zero$X1),])[,c(1, 3)]
AIC(zeroinfl.abs.cfdist)
length(fitted(zeroinfl.abs.cfdist))

# Model results: extension of their model of zero-inflation with rating
zir.app.count <- (melt(round(coef(summary(zeroinfl.rating))$count[,1:2], digits = 3)))
zir.app.zero <- (melt(round(coef(summary(zeroinfl.rating))$zero[,1:2], digits = 3)))
rbind(zir.app.count[order(zir.app.count$X1),], zir.app.zero[order(zir.app.zero$X1),])[,c(1, 3)]
AIC(zeroinfl.rating)
length(fitted(zeroinfl.rating))

# Model results: negative binomial, with interaction, no zero inflation
t.nb <- (melt(round(coef(summary(triple.nb))[,1:2], digits = 3)))
t.nb[order(t.nb$X1),c(1, 3)]
AIC(triple.nb)
length(fitted(triple.nb))

# Model results: triple interctive model with hurdle specification
t.h.count <- (melt(round(coef(summary(triple.hurdle))$count[,1:2], digits = 3)))
t.h.zero <- (melt(round(coef(summary(triple.hurdle))$zero[,1:2], digits = 3)))
rbind(t.h.count[order(t.h.count$X1),], t.h.zero[order(t.h.zero$X1),])[,c(1, 3)]
AIC(triple.hurdle)
length(fitted(triple.hurdle))

c(AIC(triple.inc), AIC(triple.hurdle), AIC(hurdle.abs.cfdist), AIC(zeroinfl.abs.cfdist),
	AIC(hurdle.rating), AIC(zeroinfl.rating), 
	AIC(triple.nb))

c(length(fitted(triple.inc)), length(fitted(hurdle.abs.cfdist)), length(fitted(hurdle.rating)),
	length(fitted(zeroinfl.abs.cfdist)), length(fitted(zeroinfl.rating)), 
	length(fitted(triple.nb)), length(fitted(triple.hurdle)))


c(sqrt(sum((fitted(triple.inc) - data.csv$amount.sum.x)^2)/
		(length(fitted(triple.inc)))),
	sqrt(sum((fitted(hurdle.abs.cfdist) - data.csv$amount.sum.x)^2)/
		(length(fitted(hurdle.abs.cfdist)))),
	sqrt(sum((fitted(hurdle.rating) - data.csv$amount.sum.x)^2)/
		(length(fitted(hurdle.rating)))),
	sqrt(sum((fitted(zeroinfl.abs.cfdist) - data.csv$amount.sum.x)^2)/
		(length(fitted(zeroinfl.abs.cfdist)))),
	sqrt(sum((fitted(zeroinfl.rating) - data.csv$amount.sum.x)^2)/
		(length(fitted(zeroinfl.rating)))),
	sqrt(sum((fitted(triple.nb) - data.csv$amount.sum.x)^2)/
		(length(fitted(triple.nb)))),
	sqrt(sum((fitted(triple.hurdle) - data.csv$amount.sum.x)^2)/
		(length(fitted(triple.hurdle)))))

#############################################
# Appendix Figure A1. Model predictions versus actual
#############################################
# Plot data
data.ggplot.appendix <- data.frame(data.csv$amount.sum.x, triple.zero.preds, triple.hurdle.preds, hurdle.rat.preds, hurdle.dist.preds)
names(data.ggplot.appendix) <- c("Amount", "ZeroInflPreds", "HurdlePreds", "HurdleRatPreds", "HurdleDistPreds")

length(data.csv$amount.sum.x[data.csv$amount.sum.x > 125000]) # Omits two real cases
length(data.ggplot.appendix$ZeroInflPreds[data.ggplot.appendix$ZeroInflPreds > 120000]) # No predictions
length(data.ggplot.appendix$HurdlePreds[data.ggplot.appendix$HurdlePreds > 120000]) # No predictions
length(data.ggplot.appendix$HurdleRatPreds[data.ggplot.appendix$HurdleRatPreds > 120000]) # No predictions
length(data.ggplot.appendix$HurdleDistPreds[data.ggplot.appendix$HurdleDistPreds > 120000]) # No predictions

triple.zero.plot <- ggplot(data = na.omit(data.ggplot.appendix), aes(ZeroInflPreds, Amount)) +
	geom_point(size = 1) +
	xlim(0, 125000) +
	ylim(0, 125000) +
	geom_abline(slope = 1) +
	labs(x = "Predicted Donation (Zero-Inflated, Interactive)", y = "Actual Donation") +
	theme_bw() +
	theme(legend.key = element_rect(colour = "black"))

triple.hurdle.plot <- ggplot(data = na.omit(data.ggplot.appendix), aes(HurdlePreds, Amount)) +
	geom_point(size = 1) +
	xlim(0, 125000) +
	ylim(0, 125000) +
	geom_abline(slope = 1) +
	labs(x = "Predicted Donation (Hurdle, Interactive)", y = "Actual Donation") +
	theme_bw() +
	theme(legend.key = element_rect(colour = "black"))

hurdle.rat.plot <- ggplot(data = na.omit(data.ggplot.appendix), aes(HurdleRatPreds, Amount)) +
	geom_point(size = 1) +
	xlim(0, 125000) +
	ylim(0, 125000) +
	geom_abline(slope = 1) +
	labs(x = "Predicted Donation (Hurdle, Only Rating)", y = "Actual Donation") +
	theme_bw() +
	theme(legend.key = element_rect(colour = "black"))

hurdle.dist.plot <- ggplot(data = na.omit(data.ggplot.appendix), aes(HurdleDistPreds, Amount)) +
	geom_point(size = 1) +
	xlim(0, 125000) +
	ylim(0, 125000) +
	geom_abline(slope = 1) +
	labs(x = "Predicted Donation (Hurdle, Only Distance)", y = "Actual Donation") +
	theme_bw() +
	theme(legend.key = element_rect(colour = "black"))

ggarrange(triple.zero.plot, triple.hurdle.plot, hurdle.dist.plot, hurdle.rat.plot, ncol = 2, nrow = 2, 
	common.legend = TRUE, legend = "bottom")
dev.off()



#############################################
# Appendix Figure A2. Model errors
#############################################

# Look at the errors
data.ggplot.appendix.errors <- data.ggplot.appendix
data.ggplot.appendix.errors$ZeroInflPredsErrors <- data.ggplot.appendix.errors$Amount -data.ggplot.appendix.errors$ZeroInflPreds
data.ggplot.appendix.errors$HurdlePredsErrors <- data.ggplot.appendix.errors$Amount -data.ggplot.appendix.errors$HurdlePreds
data.ggplot.appendix.errors$HurdleRatPredsErrors <- data.ggplot.appendix.errors$Amount - data.ggplot.appendix.errors$HurdleRatPreds
data.ggplot.appendix.errors$HurdleDistPredsErrors <- data.ggplot.appendix.errors$Amount - data.ggplot.appendix.errors$HurdleDistPreds 

data.reduc.appendix.errors <- data.frame(subset(data.ggplot.appendix.errors, data.ggplot.appendix.errors$Amount < 200000))

names(data.reduc.appendix.errors)

error.hist1 <- ggplot(data = data.reduc.appendix.errors, aes(ZeroInflPredsErrors)) +
	geom_histogram() +
	xlim(-100000, 100000) + 
	ylim(0, 800) + 
	labs(x = "Errors (Zero-Inflated, Interactive)", y = "Count") +
	theme_bw()

error.hist2 <- ggplot(data = data.reduc.appendix.errors, aes(HurdlePredsErrors)) +
	geom_histogram() +
	xlim(-100000, 100000) + 
	ylim(0, 800) + 
	labs(x = "Errors (Hurdle, Interactive)", y = "Count") +
	theme_bw()

error.hist3 <- ggplot(data = data.reduc.appendix.errors, aes(HurdleRatPredsErrors)) +
	geom_histogram() +
	xlim(-100000, 100000) + 
	ylim(0, 800) + 
	labs(x = "Errors (Hurdle, Only Rating)", y = "Count") +
	theme_bw()

error.hist4 <- ggplot(data = data.reduc.appendix.errors, aes(HurdleDistPredsErrors)) +
	geom_histogram() +
	xlim(-100000, 100000) + 
	ylim(0, 800) + 
	labs(x = "Errors (Hurdle, Only Distance)", y = "Count") +
	theme_bw()

ggarrange(error.hist1, error.hist2, error.hist4, error.hist3, ncol = 2, nrow = 2, 
	common.legend = TRUE, legend = "bottom")
dev.off()



